# Install if not already installed (remove the comment if not installed)
 

#remotes::install_github('susanathey/causalTree') # Uncomment this to install the causalTree package
#remotes::install_github('grf-labs/sufrep') # Uncomment this to install the sufrep package



# Install packages that are not already installed from this list of libraries
library(causalTree)
library(sufrep)
library(causalTree)
library(dplyr)       # Data manipulation (0.8.0.1)
library(fBasics)     # Summary statistics (3042.89)
library(corrplot)    # Correlations (0.84)
library(psych)       # Correlation p-values (1.8.12)
library(grf)         # Generalized random forests (0.10.2)
library(rpart)       # Classification and regression trees, or CART (4.1-13)
library(rpart.plot)  # Plotting trees (3.0.6)
library(treeClust)   # Predicting leaf position for causal trees (1.1-7)
library(car)         # linear hypothesis testing for causal tree (3.0-2)
library(remotes)    # Install packages from github (2.0.1)
library(readr)       # Reading csv files (1.3.1)
library(tidyr)       # Database operations (0.8.3)
library(tibble)      # Modern alternative to data frames (2.1.1)
library(knitr)       # RMarkdown (1.21)
library(kableExtra)  # Prettier RMarkdown (1.0.1)
library(ggplot2)     # general plotting tool (3.1.0)
library(haven)       # read stata files (2.0.0)
library(aod)         # hypothesis testing (1.3.1)
library(evtree)      # evolutionary learning of globally optimal trees (1.0-7)
library(estimatr)    # simple interface for OLS estimation w/ robust std errors ()
library(glmnet)
library(splines)
library(lmtest)
library(sandwich)
library(ggplot2)
library(openxlsx)


  ############################ EDUCATION RANKFIRST ##################
    
  # Set the project directory
  projectdir <- "/Replication package"

  # Paths for different types of files/data
  datapath <- file.path(projectdir, "MainData")
  cfdirpath <- file.path(projectdir, "CausalForest")
  outputpath <- file.path(projectdir, "Output")
  
  # Set the seed for reproducibility
  set.seed(12345)  

  #---------------------------------------#
  # Data Read
  #---------------------------------------#

  # Read the data. Here is the dataset for humans vs. control A, corresponding to 
  # those with preferences for education. The 170 observations that could not be 
  # contacted have been removed."
  file_path <- file.path(datapath, "CausalForest.dta")
  data <- read_dta(file_path)
  
  # Drop rows containing missing values
  df <- na.omit(data)
  # Rename variables: 
  df <- df %>% rename(Y=education_rankfirst,W=treatment)
  
  #save sample
  write.xlsx(df, file.path(cfdirpath, "sample_cf_human.xlsx"), rowNames = FALSE)
  
  #---------------------------------------#
  # Data Split 
  #---------------------------------------#
  
  #train
  train_fraction <- 1  # Use train_fraction % of the dataset to train our models. 
  
  n <- nrow(df)
  train_idx <- sample.int(n, replace=F, size=floor(n*train_fraction))
  df_train <- df[train_idx,]
  df_test <- df[-train_idx,]
  
  # Diving the data 50%-50% into splitting and estimation
  split_size <- floor(nrow(df_train) * 0.5)
  split_idx <- sample(nrow(df_train), replace=FALSE, size=split_size)
  
  # Make the splits
  df_split <- df_train[split_idx,]
  df_est <- df_train[-split_idx,]
  
  # Isolate the "treatment" as a matrix
  W <- as.matrix(df_train$W)
  
  # Covariates: 
  covariates <- c("sexo","prom_notas", "ptje_ranking", "agno_egreso", "municipal",
                  "subvencionado", "particular", "bajo_linea_pobreza", 
                  "padres_superior", "padres_incompleta", "padres_media_completa", "tecnico", "prom_cm_actual" )
  
  
  # formula
  fmla <- formula(paste0("~ 0 +", paste0(covariates, collapse="+")))
  X <- model.matrix(fmla, df_train)
  
  #From now on, my working dataset is df_train 
  n <- nrow(df_train)
  
  # Isolate the outcome as a matrix
  Y <- as.matrix(df_train$Y)
  
  #---------------------------------------#
  # Causal Forest Estimation 
  #---------------------------------------#
  
  cf <- causal_forest(X,Y,W,W.hat= mean(as.numeric(as.character(df_train$W))),
                      num.trees=n) 
  
  #---------------------------------------#
  # Predict values
  #---------------------------------------#
  
  # Get forest predictions. 
  tau.hat <- predict(cf)$predictions 
  m.hat <- cf$Y.hat  
  e.hat <- cf$W.hat  
  tau.hat <- cf$predictions  # tau(X) estimates
  
  #######################################################
  ############# Heterogeneity analysis ##################
  #######################################################
  
  #Now I redefine X with the most important covariates:
  
  
  covariates <- c("sexo","prom_notas", "ptje_ranking","agno_egreso", "municipal", "subvencionado", "particular", 
                  "bajo_linea_pobreza", "padres_superior","prom_cm_actual",  "padres_incompleta", "padres_media_completa")
  
  # formula
  fmla <- formula(paste0("~ 0 +", paste0(covariates, collapse="+")))
  X <- model.matrix(fmla, df_train)
    
  n <- nrow(df_train)
  
  # Number of rankings that the predictions will be ranking on 
  # (e.g., 2 for above/below median estimated CATE, 5 for estimated CATE quintiles, etc.)
  num.rankings <- 5  
  
  # Prepare for data.splitting
  # Assign a fold number to each observation.
  # The argument 'clusters' in the next step will mimick K-fold cross-fitting.
  num.folds <- 10
  folds <- sort(seq(n) %% num.folds) + 1
  
  # Randomized settings with fixed and known probabilities 
  forest <- causal_forest(X, Y, W, W.hat= mean(as.numeric(as.character(df_train$W))), clusters = folds)
  
  # Retrieve out-of-bag predictions.
  # Predictions for observation in fold k will be computed using 
  # trees that were not trained using observations for that fold.
  tau.hat <- predict(forest)$predictions
  
  # Rank observations *within each fold* into quintiles according to their CATE predictions.
  ranking <- rep(NA, n)
  for (fold in seq(num.folds)) {
    tau.hat.quantiles <- quantile(tau.hat[folds == fold], probs = seq(0, 1, by=1/num.rankings))
    ranking[folds == fold] <- cut(tau.hat[folds == fold], tau.hat.quantiles, include.lowest=TRUE,labels=seq(num.rankings))
  }
  
  # Now i see tau-hat ranking 
  
  tau.ranked <- data.frame(tau.hat, ranking)
  
  
  a <- mapply(function(covariate) {
    # Looping over covariate names
    # Compute average covariate value per ranking (with correct standard errors)
    fmla <- formula(paste0(covariate, "~ 0 + ranking"))
    ols <- lm(fmla, data=transform(df_train, ranking=factor(ranking)))
    ols.res <- coeftest(ols, vcov=vcovHC(ols, "HC2"))
    
    # Retrieve results
    avg <- ols.res[,1]
    stderr <- ols.res[,2]
    
    b <- sd(df_train$covariate)
    
    # Tally up results
    data.frame(covariate, avg, stderr, ranking=paste0("Q", seq(num.rankings)), 
               scaling=pnorm((avg - mean(avg))/sd(avg)),
               variation=sd(avg)/b,
               labels=paste0(signif(avg, 3), "\n", "(", signif(stderr, 3), ")"))
  }, covariates, SIMPLIFY = FALSE)
  
  
  a <- do.call(rbind, a)
  
  a$covariate <- reorder(a$covariate, order(a$variation))
  
  #Save results
  write.xlsx(a, file.path(cfdirpath, "education_rankfirst.xlsx"), rowNames = FALSE)
  
  #Clear all
  rm(list = ls())
  


############################ EDUCATION PROPORTION ###################
  
  # Set the project directory
  projectdir <- "/Replication package"
  
  # Paths for different types of files/data
  datapath <- file.path(projectdir, "MainData")
  cfdirpath <- file.path(projectdir, "CausalForest")
  outputpath <- file.path(projectdir, "Output")
  
  # Set the seed for reproducibility
  set.seed(12345)  
  
  #---------------------------------------#
  # Data Read
  #---------------------------------------#
  
  # Read the data. Here is the dataset for humans vs. control A, corresponding to 
  # those with preferences for education. The 170 observations that could not be 
  # contacted have been removed."
  file_path <- file.path(datapath, "CausalForest.dta")
  data <- read_dta(file_path)
  
  # Drop rows containing missing values
  df <- na.omit(data)
  # Rename variables: 
  df <- df %>% rename(Y=education_proportion,W=treatment)
  
  #---------------------------------------#
  # Data Split
  #---------------------------------------#
  
  #train
  train_fraction <- 1  # Use train_fraction % of the dataset to train our models. 
    
  n <- nrow(df)
  train_idx <- sample.int(n, replace=F, size=floor(n*train_fraction))
  df_train <- df[train_idx,]
  df_test <- df[-train_idx,]
  
  # Diving the data 50%-50% into splitting and estimation
  split_size <- floor(nrow(df_train) * 0.5)
  split_idx <- sample(nrow(df_train), replace=FALSE, size=split_size)
  
  # Make the splits
  df_split <- df_train[split_idx,]
  df_est <- df_train[-split_idx,]
  
  # Isolate the "treatment" as a matrix
  W <- as.matrix(df_train$W)
  
  # Covariates: 
  covariates <- c("sexo","prom_notas", "ptje_ranking", "agno_egreso", "municipal",
                  "subvencionado", "particular", "bajo_linea_pobreza", 
                  "padres_superior", "padres_incompleta", "padres_media_completa", "tecnico", "prom_cm_actual" )
  
  
  # formula
  fmla <- formula(paste0("~ 0 +", paste0(covariates, collapse="+")))
  X <- model.matrix(fmla, df_train)
  
  #From now on, my working dataset is df_train 
  n <- nrow(df_train)
  
  # Isolate the outcome as a matrix
  Y <- as.matrix(df_train$Y)
  
  #---------------------------------------#
  # Causal Forest Estimation 
  #---------------------------------------#
  
  cf <- causal_forest(X,Y,W,W.hat= mean(as.numeric(as.character(df_train$W))),
                      num.trees=n) 
  
  #---------------------------------------#
  # Predict values
  #---------------------------------------#
  
  # Get forest predictions. 
  tau.hat <- predict(cf)$predictions 
  m.hat <- cf$Y.hat  
  e.hat <- cf$W.hat  
  tau.hat <- cf$predictions  # tau(X) estimates
  
  
  #######################################################
  ############# Heterogeneity analysis ##################
  #######################################################
  
  #Now I redefine X with the most important covariates:
  
  
  covariates <- c("sexo","prom_notas", "ptje_ranking","agno_egreso", "municipal", "subvencionado", "particular", 
                  "bajo_linea_pobreza","prom_cm_actual", "padres_superior", "padres_incompleta", "padres_media_completa")
  
  # formula
  fmla <- formula(paste0("~ 0 +", paste0(covariates, collapse="+")))
  X <- model.matrix(fmla, df_train)
    
  n <- nrow(df_train)
  
  # Number of rankings that the predictions will be ranking on 
  # (e.g., 2 for above/below median estimated CATE, 5 for estimated CATE quintiles, etc.)
  num.rankings <- 5  
  
  # Prepare for data.splitting
  # Assign a fold number to each observation.
  # The argument 'clusters' in the next step will mimick K-fold cross-fitting.
  num.folds <- 10
  folds <- sort(seq(n) %% num.folds) + 1
  
  # Randomized settings with fixed and known probabilities 
  forest <- causal_forest(X, Y, W, W.hat= mean(as.numeric(as.character(df_train$W))), clusters = folds)
  
  # Retrieve out-of-bag predictions.
  # Predictions for observation in fold k will be computed using 
  # trees that were not trained using observations for that fold.
  tau.hat <- predict(forest)$predictions
  
  # Rank observations *within each fold* into quintiles according to their CATE predictions.
  ranking <- rep(NA, n)
  for (fold in seq(num.folds)) {
    tau.hat.quantiles <- quantile(tau.hat[folds == fold], probs = seq(0, 1, by=1/num.rankings))
    ranking[folds == fold] <- cut(tau.hat[folds == fold], tau.hat.quantiles, include.lowest=TRUE,labels=seq(num.rankings))
  }
  
  # Now i see tau-hat ranking 
  
  tau.ranked <- data.frame(tau.hat, ranking)
  
  
  a <- mapply(function(covariate) {
    # Looping over covariate names
    # Compute average covariate value per ranking (with correct standard errors)
    fmla <- formula(paste0(covariate, "~ 0 + ranking"))
    ols <- lm(fmla, data=transform(df_train, ranking=factor(ranking)))
    ols.res <- coeftest(ols, vcov=vcovHC(ols, "HC2"))
    
    # Retrieve results
    avg <- ols.res[,1]
    stderr <- ols.res[,2]
    
    b <- sd(df_train$covariate)
    
    # Tally up results
    data.frame(covariate, avg, stderr, ranking=paste0("Q", seq(num.rankings)), 
               scaling=pnorm((avg - mean(avg))/sd(avg)),
               variation=sd(avg)/b,
               labels=paste0(signif(avg, 3), "\n", "(", signif(stderr, 3), ")"))
  }, covariates, SIMPLIFY = FALSE)
  
  
  a <- do.call(rbind, a)
  
  a$covariate <- reorder(a$covariate, order(a$variation))
  
  #Save results
  write.xlsx(a, file.path(cfdirpath, "education_proportion.xlsx"), rowNames = FALSE)
  
  #Clear all
  rm(list=ls())
  

  ############################ EDUCATION LISTED ###################
  
  # Set the project directory
  projectdir <- "/Replication package"
  
  # Paths for different types of files/data
  datapath <- file.path(projectdir, "MainData")
  cfdirpath <- file.path(projectdir, "CausalForest")
  outputpath <- file.path(projectdir, "Output")
  
  # Set the seed for reproducibility
  set.seed(12345)  
  
  #---------------------------------------#
  # Data Read
  #---------------------------------------#

  # Read the data. Here is the dataset for humans vs. control A, corresponding to 
  # those with preferences for education. The 170 observations that could not be 
  # contacted have been removed."
  file_path <- file.path(datapath, "CausalForest.dta")
  data <- read_dta(file_path)
  
  # Drop rows containing missing values
  df <- na.omit(data)
  # Rename variables: 
  df <- df %>% rename(Y=education_listed,W=treatment)
  
  #---------------------------------------#
  # Data Split
  #---------------------------------------#
  
  #train
  train_fraction <- 1  # Use train_fraction % of the dataset to train our models. 
    
  n <- nrow(df)
  train_idx <- sample.int(n, replace=F, size=floor(n*train_fraction))
  df_train <- df[train_idx,]
  df_test <- df[-train_idx,]
  
  # Diving the data 50%-50% into splitting and estimation
  split_size <- floor(nrow(df_train) * 0.5)
  split_idx <- sample(nrow(df_train), replace=FALSE, size=split_size)
  
  # Make the splits
  df_split <- df_train[split_idx,]
  df_est <- df_train[-split_idx,]
  
  # Isolate the "treatment" as a matrix
  W <- as.matrix(df_train$W)
  
  # Covariates: 
  
  covariates <- c("sexo","prom_notas", "ptje_ranking", "agno_egreso", "municipal",
                  "subvencionado", "particular", "bajo_linea_pobreza", 
                  "padres_superior", "padres_incompleta", "padres_media_completa", "tecnico", "prom_cm_actual" )
  
  
  # formula
  fmla <- formula(paste0("~ 0 +", paste0(covariates, collapse="+")))
  X <- model.matrix(fmla, df_train)
  
  #From now on, my working dataset is df_train 
  n <- nrow(df_train)
  
  # Isolate the outcome as a matrix
  Y <- as.matrix(df_train$Y)
  
  #---------------------------------------#
  # Causal Forest Estimation 
  #---------------------------------------#
  
  cf <- causal_forest(X,Y,W,W.hat= mean(as.numeric(as.character(df_train$W))),
                      num.trees=n) 
  
  #---------------------------------------#
  # Predict values
  #---------------------------------------#
  
  # Get forest predictions. 
  tau.hat <- predict(cf)$predictions 
  m.hat <- cf$Y.hat  
  e.hat <- cf$W.hat  
  tau.hat <- cf$predictions  # tau(X) estimates
  

  #######################################################
  ############# Heterogeneity analysis ##################
  #######################################################
  
  #Now I redefine X with the most important covariates:
  
  
  covariates <- c("sexo","prom_notas", "ptje_ranking","agno_egreso", "municipal", "subvencionado", "particular", 
                  "bajo_linea_pobreza", "padres_superior", "prom_cm_actual", "padres_incompleta", "padres_media_completa")
  
  # formula
  fmla <- formula(paste0("~ 0 +", paste0(covariates, collapse="+")))
  X <- model.matrix(fmla, df_train)
  
  n <- nrow(df_train)
  
  # Number of rankings that the predictions will be ranking on 
  # (e.g., 2 for above/below median estimated CATE, 5 for estimated CATE quintiles, etc.)
  num.rankings <- 5  
  
  # Prepare for data.splitting
  # Assign a fold number to each observation.
  # The argument 'clusters' in the next step will mimick K-fold cross-fitting.
  num.folds <- 10
  folds <- sort(seq(n) %% num.folds) + 1
  
  # Randomized settings with fixed and known probabilities 
  forest <- causal_forest(X, Y, W, W.hat= mean(as.numeric(as.character(df_train$W))), clusters = folds)
  
  # Retrieve out-of-bag predictions.
  # Predictions for observation in fold k will be computed using 
  # trees that were not trained using observations for that fold.
  tau.hat <- predict(forest)$predictions
  
  # Rank observations *within each fold* into quintiles according to their CATE predictions.
  ranking <- rep(NA, n)
  for (fold in seq(num.folds)) {
    tau.hat.quantiles <- quantile(tau.hat[folds == fold], probs = seq(0, 1, by=1/num.rankings))
    ranking[folds == fold] <- cut(tau.hat[folds == fold], tau.hat.quantiles, include.lowest=TRUE,labels=seq(num.rankings))
  }
  
  # Now i see tau-hat ranking 
  
  tau.ranked <- data.frame(tau.hat, ranking)
  
  
  a <- mapply(function(covariate) {
    # Looping over covariate names
    # Compute average covariate value per ranking (with correct standard errors)
    fmla <- formula(paste0(covariate, "~ 0 + ranking"))
    ols <- lm(fmla, data=transform(df_train, ranking=factor(ranking)))
    ols.res <- coeftest(ols, vcov=vcovHC(ols, "HC2"))
    
    # Retrieve results
    avg <- ols.res[,1]
    stderr <- ols.res[,2]
    
    b <- sd(df_train$covariate)
    
    # Tally up results
    data.frame(covariate, avg, stderr, ranking=paste0("Q", seq(num.rankings)), 
               scaling=pnorm((avg - mean(avg))/sd(avg)),
               variation=sd(avg)/b,
               labels=paste0(signif(avg, 3), "\n", "(", signif(stderr, 3), ")"))
  }, covariates, SIMPLIFY = FALSE)
  
  
  a <- do.call(rbind, a)
  
  a$covariate <- reorder(a$covariate, order(a$variation))
  
  #Save results
  write.xlsx(a, file.path(cfdirpath, "education_listed.xlsx"), rowNames = FALSE)
 
  #Clear all
  rm(list = ls())